import pandas as pd
import matplotlib.pyplot as plt
import datetime
from datetime import datetime, timedelta
import numpy as np
import plotly.offline as pyo
import plotly.graph_objs as go
from plotly import tools
pyo.init_notebook_mode(connected=True)
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
import geopandas as gpd
import folium
from shapely.geometry import Point
import shapely.wkt
from shapely.geometry import Polygon
import warnings
warnings.filterwarnings('ignore')
# We are analyzing one day of taxi rides.
# Load the rides dataframe: only the columns used in this analysis, and
# only the first 300000 rows of the file.
df = pd.read_csv(
    '/Users/alex/Documents/Transportation_Network_Providers_-_Trips.csv',
    usecols=[
        'Trip Start Timestamp', 'Trip End Timestamp', 'Trip Seconds',
        'Trip Miles', 'Pickup Census Tract', 'Dropoff Census Tract',
        'Pickup Community Area', 'Dropoff Community Area', 'Fare', 'Tip',
        'Additional Charges', 'Trip Total', 'Shared Trip Authorized',
        'Trips Pooled', 'Pickup Centroid Latitude',
        'Pickup Centroid Longitude', 'Pickup Centroid Location',
        'Dropoff Centroid Latitude', 'Dropoff Centroid Longitude',
        'Dropoff Centroid Location',
    ],
    nrows=300000,
)
# Load the city community-area boundaries as a geodataframe (read once).
chicago = gpd.read_file('/Users/alex/Documents/Boundaries - Community Areas (current).geojson')
df.info()
# Create datetime columns. Passing the whole column to pd.to_datetime parses
# it in one vectorized call instead of one Python-level call per row.
timestamp_format = '%m/%d/%Y %I:%M:%S %p'
df['Trip Start Timestamp'] = pd.to_datetime(df['Trip Start Timestamp'], format=timestamp_format)
df['Trip End Timestamp'] = pd.to_datetime(df['Trip End Timestamp'], format=timestamp_format)
# Count the rides that start in each hour of the day.
hourly_ride_counts = df.resample('H', on='Trip Start Timestamp').size()
rides_per_hour = (
    hourly_ride_counts
    .rename('Number of trips')
    .rename_axis('Timeline')
    .reset_index()
)
# Bar chart: number of rides per hour.
fig = go.Figure(
    data=[go.Bar(x=rides_per_hour['Timeline'], y=rides_per_hour['Number of trips'])],
    layout=go.Layout(title='Number of Rides Per Hour'),
)
pyo.iplot(fig)
# Median number of miles per ride, grouped by the hour the ride started.
ride_miles_per_hour = pd.DataFrame(df.resample('H', on='Trip Start Timestamp')['Trip Miles'].median()).reset_index()
# Median miles per ride visualization.
trace = go.Bar(x=ride_miles_per_hour['Trip Start Timestamp'],
               y=ride_miles_per_hour['Trip Miles'])
layout = go.Layout(
    # The plotted value is the hourly median of 'Trip Miles' (a distance per
    # ride), not an average and not a speed, so the title says so.
    title='Median Miles per Ride by Hour',
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
pyo.iplot(fig)
# Median fare per trip, grouped by the hour the trip started.
ride_fare_per_hour = (
    df.resample('H', on='Trip Start Timestamp')['Fare']
    .median()
    .reset_index()
)
# Bar chart of the hourly median fare.
fig = go.Figure(
    data=[go.Bar(x=ride_fare_per_hour['Trip Start Timestamp'], y=ride_fare_per_hour['Fare'])],
    layout=go.Layout(title='Median fare per trip [USD]'),
)
pyo.iplot(fig)
# Total fare over all loaded rides.
total_fare_per_day = df['Fare'].sum()
total_fare_per_day
# Total additional charges over all loaded rides (shown to 2 decimals).
total_add_charges_per_day = df['Additional Charges'].sum()
np.round(total_add_charges_per_day, 2)
# Total revenue = fares + additional charges, rounded to 2 decimals.
total_revenue = np.round(total_fare_per_day + total_add_charges_per_day, 2)
total_revenue
# Hourly revenue: sum of fares and sum of additional charges per start hour.
hourly_rides = df.resample('H', on='Trip Start Timestamp')
ride_fare_per_hour = hourly_rides['Fare'].sum().reset_index()
ride_charge_per_hour = hourly_rides['Additional Charges'].sum().reset_index()
# Stacked bar chart: fares at the bottom, additional charges on top.
fare_bars = go.Bar(
    x=ride_fare_per_hour['Trip Start Timestamp'],
    y=ride_fare_per_hour['Fare'],
    name='Fare',
)
charge_bars = go.Bar(
    x=ride_charge_per_hour['Trip Start Timestamp'],
    y=ride_charge_per_hour['Additional Charges'],
    name='Additonal Charges',
)
fig = go.Figure(
    data=[fare_bars, charge_bars],
    layout=go.Layout(barmode='stack', title='Revenue per Hour'),
)
pyo.iplot(fig, filename='stacked-bar')
# Average velocity per ride (miles/hour).
# A ride with 'Trip Seconds' == 0 would divide by zero and yield an infinite
# (or NaN) velocity; treat a zero duration as missing so those rides are
# skipped by the median below instead of counting as infinitely fast.
trip_hours = df['Trip Seconds'].replace(0, np.nan) / 3600
df['velocity'] = df['Trip Miles'] / trip_hours
# Median velocity for rides starting in each hour.
ride_velocity = pd.DataFrame(df.resample('H', on='Trip Start Timestamp').velocity.median()).reset_index()
# Rides velocity visualization.
trace = go.Bar(x=ride_velocity['Trip Start Timestamp'], y=ride_velocity.velocity)
layout = go.Layout(
    title='Median Velocity [Miles/hr]',
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
pyo.iplot(fig)
# Revenue per minute per ride.
# Keep rides with a non-zero duration and a non-zero fare. The fare mask is
# built from df_test itself (same index as the frame being filtered), and
# the result is copied so the new column is added to an independent frame
# rather than to a slice of df.
df_test = df[df['Trip Seconds'] != 0]
df_reven = df_test[df_test['Fare'] != 0].copy()
# Round the final fare-per-minute ratio (not the trip duration) to 2 decimals.
trip_minutes = df_reven['Trip Seconds'] / 60
df_reven['rev_min'] = (df_reven['Fare'] / trip_minutes).round(2)
# Show the two rides with the highest revenue per minute.
df_reven.sort_values(by='rev_min', ascending=False).head(2)
# There are some inconsistencies in the data that produce very large revenue-per-minute values.
# In some cases Trip Seconds is 12 while Trip Miles is 20, which is impossible.
# Let's raise the minimum Trip Seconds threshold to 240 seconds.
# Revenue per minute per ride, restricted to rides of at least 240 seconds.
# The fare mask is built from df_test itself (same index as the frame being
# filtered), and the result is copied so the new column is added to an
# independent frame rather than to a slice of df.
df_test = df[df['Trip Seconds'] >= 240]
df_reven = df_test[df_test['Fare'] != 0].copy()
# Round the final fare-per-minute ratio (not the trip duration) to 2 decimals.
trip_minutes = df_reven['Trip Seconds'] / 60
df_reven['rev_min'] = (df_reven['Fare'] / trip_minutes).round(2)
# Box plot of the revenue-per-minute distribution.
fig = go.Figure()
fig.add_trace(go.Box(y=df_reven['rev_min']))
fig.show()
# Here I will try to figure out which kinds of trips are most common in terms of city areas:
# Split the rides into kinds according to which community areas are known.
pickup_area = df['Pickup Community Area']
dropoff_area = df['Dropoff Community Area']
pickup_missing = pickup_area.isnull()
dropoff_missing = dropoff_area.isnull()
both_known = ~(pickup_missing | dropoff_missing)

# neither pickup nor dropoff community area is recorded
null_rides = df[dropoff_missing & pickup_missing]
# pickup area recorded, dropoff area missing
null_dropoff = df[dropoff_missing & ~pickup_missing]
# dropoff area recorded, pickup area missing
null_pickup = df[pickup_missing & ~dropoff_missing]
# pickup and dropoff in the same area (NaN never equals NaN, so both are known)
same_area = df[dropoff_area == pickup_area]
# pickup and dropoff differ; this still includes rides with a missing area
differnt_area = df[dropoff_area != pickup_area]
# keep only the rides where both areas are recorded
city_dif_area = differnt_area[both_known.loc[differnt_area.index]]
same_city_area = same_area[both_known.loc[same_area.index]]

# Pie chart with the number of rides of each kind.
ride_kinds = [
    ('Rides without area destinations', null_rides),
    ('Rides without dropoff', null_dropoff),
    ('Rides without pickups', null_pickup),
    ('Rides between city areas', city_dif_area),
    ('Rides within same city areas', same_city_area),
]
labels = [kind_label for kind_label, _ in ride_kinds]
values = [len(kind_rides) for _, kind_rides in ride_kinds]
colors = ['#FEBFB3', '#E1396C', '#96D38C', '#D0F9B1', '#b3fec1']
slice_style = dict(colors=colors, line=dict(color='#000000', width=2))
trace = go.Pie(
    labels=labels,
    values=values,
    hoverinfo='label+percent',
    textinfo='value',
    textfont=dict(size=20),
    marker=slice_style,
)
pyo.iplot([trace], filename='styled_pie_chart')
# Hourly distributions of the different kinds of trips.
def _hourly_trip_counts(trips):
    """Return a frame with the number of trips that start in each hour.

    The result has the columns 'Trip Start Timestamp' and 'Count'.
    """
    counts = trips.resample('H', on='Trip Start Timestamp').size()
    return counts.rename('Count').reset_index()


null_pickup_time = _hourly_trip_counts(null_pickup)
null_dropoff_time = _hourly_trip_counts(null_dropoff)
same_city_area_time = _hourly_trip_counts(same_city_area)
city_dif_area_time = _hourly_trip_counts(city_dif_area)

# One bar chart per trip kind, stacked vertically; each panel is titled so
# the four distributions can be told apart.
hourly_panels = [
    ('Rides without pickup area', null_pickup_time),
    ('Rides without dropoff area', null_dropoff_time),
    ('Rides within the same city area', same_city_area_time),
    ('Rides between city areas', city_dif_area_time),
]
fig = make_subplots(
    rows=len(hourly_panels),
    cols=1,
    subplot_titles=[panel_title for panel_title, _ in hourly_panels],
)
for panel_row, (panel_title, hourly_counts) in enumerate(hourly_panels, start=1):
    fig.add_trace(
        go.Bar(
            x=hourly_counts['Trip Start Timestamp'],
            y=hourly_counts['Count'],
            name=panel_title,
        ),
        row=panel_row,
        col=1,
    )
# update_layout must be *called*: assigning to fig.update_layout only
# replaces the method with a Layout object and leaves the figure unchanged.
fig.update_layout(
    title='Hourly Rides by Trip Kind',
    autosize=False,
    width=500,
    height=1000,
    showlegend=False,
)
fig.show()
# Trips that start or end outside the city areas have different hourly distributions.
def _count_trips_by_area(area_column, count_name):
    """Count the rows of df per value of area_column.

    Returns a frame with the columns [count_name, 'community_area'], where
    'community_area' is the area number cast to int.
    """
    counts = df.groupby(area_column).size().reset_index(name=count_name)
    counts['community_area'] = counts[area_column].astype(int)
    return counts[[count_name, 'community_area']]


# Number of pickups per city area.
pickup_per_area = _count_trips_by_area('Pickup Community Area', 'pickup_number')
# Number of dropoffs per city area.
dropoff_per_area = _count_trips_by_area('Dropoff Community Area', 'dropoff_number')
# Attach both counts to the community-area geodataframe; the left joins keep
# every area, including areas without any pickups or dropoffs.
chicago['community_area'] = chicago['area_num_1'].astype(int)
for area_counts in (pickup_per_area, dropoff_per_area):
    chicago = pd.merge(chicago, area_counts, how='left', on='community_area')
# Map centred on the city.
chicago_center = [41.8781, -87.6298]
pickup_map = folium.Map(location=chicago_center, zoom_start=10)
# Choropleth of the number of pickups per community area; features are
# matched to the data rows through the 'area_numbe' property.
pickup_layer_options = dict(
    geo_data=chicago,
    name='geometry',
    data=chicago,
    columns=['area_numbe', 'pickup_number'],
    key_on='feature.properties.area_numbe',
    fill_color='YlGn',
    fill_opacity=0.8,
    line_opacity=0.5,
    legend_name='Pickup dencity',
)
pickup_layer = folium.Choropleth(**pickup_layer_options)
pickup_layer.add_to(pickup_map)
folium.LayerControl().add_to(pickup_map)
display(pickup_map)
# Quantile-based bin edges for the choropleth colour scale.
pickup_quantiles = [0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
bins = chicago['pickup_number'].quantile(pickup_quantiles).tolist()
# Map centred on the city.
chicago_center = [41.8781, -87.6298]
pickup_map2 = folium.Map(location=chicago_center, zoom_start=11)
# chicago_pickup is an alias of chicago (not a copy), so the 'center' column
# added further down also appears on chicago.
chicago_pickup = chicago
# The choropleth is created before the 'center' column is added.
pickup_layer = folium.Choropleth(
    geo_data=chicago_pickup,
    name='geometry',
    data=chicago_pickup,
    columns=['area_numbe', 'pickup_number'],
    key_on='feature.properties.area_numbe',
    fill_color='BuPu',
    fill_opacity=0.8,
    line_opacity=0.5,
    legend_name='Pickup dencity',
    bins=bins,
    reset=True,
)
pickup_layer.add_to(pickup_map2)
folium.LayerControl().add_to(pickup_map2)
chicago_pickup['center'] = chicago_pickup.geometry.centroid
# One marker per community area, placed at its centroid, with a popup that
# shows the area number, its name and its pickup count.
for _, area in chicago_pickup.iterrows():
    centroid = area['center']
    popup_text = (
        f"Community area: {area['community_area']!s}-{area['community']!s}"
        f"; Pickups: {area['pickup_number']!s}"
    )
    folium.Marker(location=[centroid.y, centroid.x], popup=popup_text).add_to(pickup_map2)
display(pickup_map2)
# chicago_dropoff is an alias of chicago (not a copy): changes made here are
# visible through both names.
chicago_dropoff = chicago
# Quantile-based bin edges for the dropoff choropleth.
bins = list(chicago_dropoff['dropoff_number'].quantile([0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]))
# City coordinates.
chicago_center = [41.8781, -87.6298]
dropoff_map = folium.Map(location=chicago_center, zoom_start=11)
# Remove the 'center' column left over from the pickup map before building
# the choropleth (presumably because folium cannot serialize a second
# geometry column - confirm). errors='ignore' keeps this step working when
# the column is not there, e.g. if the pickup map step was not run.
chicago_dropoff.drop(columns='center', inplace=True, errors='ignore')
folium.Choropleth(
    geo_data=chicago_dropoff,
    name='geometry',
    data=chicago_dropoff,
    columns=['area_numbe', 'dropoff_number'],
    key_on='feature.properties.area_numbe',
    fill_color='YlGn',
    fill_opacity=0.8,
    line_opacity=0.5,
    legend_name='Dropoff dencity',
    bins=bins,
    reset=True
).add_to(dropoff_map)
folium.LayerControl().add_to(dropoff_map)
chicago_dropoff['center'] = chicago_dropoff.geometry.centroid
# Adding popups: number of dropoffs per city area. This is the dropoff map,
# so the popup reports 'dropoff_number' under a 'Dropoffs' label.
for _, area in chicago_dropoff.iterrows():
    center_point = area['center']
    location = [center_point.y, center_point.x]
    popup = ('Community area: ' + str(area['community_area']) + '-' + str(area['community']) +
             '; ' + 'Dropoffs: ' + str(area['dropoff_number']))
    marker = folium.Marker(location=location, popup=popup)
    marker.add_to(dropoff_map)
display(dropoff_map)
# Static matplotlib maps of the pickup counts, drawn once per classification
# scheme (quantiles, then equal intervals) so the two can be compared.
for classification_scheme in ('quantiles', 'equal_interval'):
    chicago.plot(
        column='pickup_number',
        scheme=classification_scheme,
        k=10,
        legend=True,
        figsize=(20, 10),
    )
plt.show()
# Ride attributes carried along with each location.
ride_attribute_columns = ['Trip Start Timestamp', 'Trip Seconds', 'Trip Miles', 'Trip Total']
# Keep only rides where every selected column (attributes and both centroid
# locations) is present.
locationsdf = df[ride_attribute_columns + ['Pickup Centroid Location', 'Dropoff Centroid Location']]
locationsdf = locationsdf.dropna()
pickup_df = locationsdf[ride_attribute_columns + ['Pickup Centroid Location']]
dropoff_df = locationsdf[ride_attribute_columns + ['Dropoff Centroid Location']]
pickup_df.info()
dropoff_df.info()
# Both geodataframes use the same coordinate reference system.
crs = {'init': 'epsg:4326'}
# Pickup geodataframe: parse the WKT location strings into geometries.
pickup_points = pickup_df['Pickup Centroid Location'].apply(shapely.wkt.loads)
pickup_df = pickup_df.drop(columns='Pickup Centroid Location')
pickup_gdf = gpd.GeoDataFrame(pickup_df, crs=crs, geometry=pickup_points)
# Dropoff geodataframe, built the same way.
dropoff_points = dropoff_df['Dropoff Centroid Location'].apply(shapely.wkt.loads)
dropoff_df = dropoff_df.drop(columns='Dropoff Centroid Location')
dropoff_gdf = gpd.GeoDataFrame(dropoff_df, crs=crs, geometry=dropoff_points)
# Bounding box of all pickup points.
xmin, ymin, xmax, ymax = pickup_gdf.total_bounds
print(xmin, ymin, xmax, ymax)
# City polygon grid: square cells of 1/300 degree per side.
width = 1/300
height = 1/300
cols = np.arange(xmin, xmax, width)
rows = np.arange(ymin, ymax, height)
# Each cell spans [x, x + width] x [y, y + height], so the cells tile the
# bounding box from (xmin, ymin) up to at least (xmax, ymax). Building the
# cells downwards (y - height) would shift the whole grid one row below the
# box and leave its top strip uncovered.
# NOTE(review): points lying exactly on a cell edge (e.g. at xmin or ymin)
# are not "within" any cell - confirm whether that matters for the joins.
polygons = [
    Polygon([(x, y), (x + width, y), (x + width, y + height), (x, y + height)])
    for x in cols
    for y in rows
]
grid = gpd.GeoDataFrame({'geometry': polygons})
grid.head()
# Set grid crs: epsg:4326.
grid.crs = {'init': 'epsg:4326'}
print(grid.crs)
# Assign every dropoff point to the grid cell that contains it.
dropoff_gridaera = gpd.sjoin(dropoff_gdf, grid, op="within")
dropoff_gridaera.info()
dropoff_gridaera.tail(5)  # the grid cell number is stored in 'index_right'
# Number of dropoffs per grid cell.
dropoff_gridnum = (
    dropoff_gridaera.groupby('index_right')
    .size()
    .to_frame('number_dropoffs')
)
dropoff_gridnum.head()
# Attach the counts to the grid, discard the cells without any dropoffs and
# expose the cell number as the 'index' column.
grid_dropoffs = (
    grid.join(dropoff_gridnum, how='outer')
    .dropna()
    .reset_index()
)
grid_dropoffs.head()
# Grid dropoff density visualization.
grid_dropoffs.plot(
    column='number_dropoffs',
    scheme='quantiles',
    k=10,
    legend=True,
    figsize=(20, 10),
)
plt.show()
# Number of grid cells that have at least one dropoff.
print(grid_dropoffs.shape[0])
# Quantile-based bin edges for the colour scale.
dropoff_quantiles = [0, 0.8, 0.85, 0.89, 0.95, 0.97, 0.98, 0.99, 0.998, 1]
bins = grid_dropoffs['number_dropoffs'].quantile(dropoff_quantiles).tolist()
# Map centred on the city.
chicago_center = [41.8781, -87.6298]
dropoff_grid_map = folium.Map(location=chicago_center, zoom_start=12)
# Choropleth of dropoffs per grid cell; cells are matched to the data rows
# through the 'index' column.
grid_layer_options = dict(
    geo_data=grid_dropoffs,
    name='geometry',
    data=grid_dropoffs,
    columns=['index', 'number_dropoffs'],
    key_on='feature.properties.index',
    fill_color='YlGn',
    fill_opacity=0.8,
    line_opacity=0.5,
    legend_name='Dropoff dencity',
    bins=bins,
    reset=True,
)
folium.Choropleth(**grid_layer_options).add_to(dropoff_grid_map)
folium.LayerControl().add_to(dropoff_grid_map)
display(dropoff_grid_map)
# Load the places dataframe.
places = pd.read_csv('/Users/alex/Library/Mobile Documents/com~apple~CloudDocs/Datasets/Places_chicago.csv')
places.head()
# Split 'Coordinates' on whitespace into separate columns, then replace the
# original column with them.
coordinate_parts = places['Coordinates'].str.split(expand=True)
places = places.join(coordinate_parts).drop(columns='Coordinates')
places.columns = ['places', 'lat', 'lng']
places = places.astype({'lat': float, 'lng': float})
places.info()
# Point geometries in (longitude, latitude) order.
geometry = [Point(lng, lat) for lng, lat in zip(places['lng'], places['lat'])]
# Geodataframe with the famous places.
places_gdf = gpd.GeoDataFrame(places, geometry=geometry)
places_gdf.head()
# Add one marker per place, with its name as the popup, to the dropoff grid map.
for _, place in places_gdf.iterrows():
    place_point = place['geometry']
    folium.Marker(
        location=[place_point.y, place_point.x],
        popup=str(place['places']),
    ).add_to(dropoff_grid_map)
display(dropoff_grid_map)
# Number of distinct dropoff centroid locations in the rides data.
df['Dropoff Centroid Location'].nunique()
# Grid cells with more than 2000 dropoffs, busiest first.
is_busy_cell = grid_dropoffs['number_dropoffs'] > 2000
grid_dropoffs_max = grid_dropoffs[is_busy_cell]
grid_dropoffs_max.sort_values(by='number_dropoffs', ascending=False).head()
# Drop the columns of the main df that the origin-destination matrix does not use.
df_odm = df.drop(['Trip End Timestamp', 'Pickup Census Tract', 'Dropoff Census Tract', 'Pickup Community Area', 'Dropoff Community Area',
                  'Shared Trip Authorized', 'Trips Pooled', 'Pickup Centroid Latitude', 'Pickup Centroid Longitude', 'Dropoff Centroid Latitude',
                  'Dropoff Centroid Longitude'], axis=1)
# Keep only the trips where both the pickup and the dropoff location are known.
df_odm_ready = df_odm[df_odm['Pickup Centroid Location'].notnull() & df_odm['Dropoff Centroid Location'].notnull()]
# Separate copy used as the base of the dropoff geodataframe.
df_odm_ready2 = df_odm_ready.copy()
crs = {'init': 'epsg:4326'}
# Pickup geodataframe. It is built from a copy so that attaching the
# geometry column cannot modify df_odm_ready.
geometry_p = df_odm_ready['Pickup Centroid Location'].map(shapely.wkt.loads)
df_odm_pick = gpd.GeoDataFrame(df_odm_ready.copy(), crs=crs, geometry=geometry_p)
# Dropoff geodataframe, built from its own copy (df_odm_ready2) so the pickup
# and dropoff frames never share underlying data.
geometry_d = df_odm_ready2['Dropoff Centroid Location'].map(shapely.wkt.loads)
df_odm_drop = gpd.GeoDataFrame(df_odm_ready2, crs=crs, geometry=geometry_d)
# Bounding box of all pickup points.
# NOTE(review): the box comes from the pickups only; dropoffs outside it fall
# in no cell - confirm whether the grid should also cover df_odm_drop.
xmin, ymin, xmax, ymax = df_odm_pick.total_bounds
print(xmin, ymin, xmax, ymax)
# Coarse grid: square cells of 1/30 degree per side.
width = 1/30
height = 1/30
cols = np.arange(xmin, xmax, width)
rows = np.arange(ymin, ymax, height)
# Each cell spans [x, x + width] x [y, y + height], so the cells tile the
# bounding box from (xmin, ymin) up to at least (xmax, ymax). Building the
# cells downwards (y - height) would shift the whole grid one row below the
# box and leave its top strip (up to one cell height) uncovered.
polygons = [
    Polygon([(x, y), (x + width, y), (x + width, y + height), (x, y + height)])
    for x in cols
    for y in rows
]
grid_odm = gpd.GeoDataFrame({'geometry': polygons})
grid_odm.shape
# Give the grid the same crs as the pickup/dropoff geodataframes.
grid_odm.crs = {'init': 'epsg:4326'}
print(df_odm_pick.crs)
print(df_odm_drop.crs)
# Tag every pickup and every dropoff with the index of the grid cell that
# contains it (sjoin stores the cell index in 'index_right').
df_odm_pick_grid = gpd.sjoin(df_odm_pick, grid_odm, op="within")
df_odm_drop_grid = gpd.sjoin(df_odm_drop, grid_odm, op="within").copy()
df_odm_pick_grid.head(2)
df_odm_drop_grid.head(2)
# Ride attributes that are not needed for the matrix.
ride_attribute_columns = [
    'Trip Start Timestamp', 'Trip Seconds', 'Trip Miles', 'Fare', 'Tip',
    'Additional Charges', 'Trip Total', 'Pickup Centroid Location',
    'Dropoff Centroid Location', 'velocity',
]
df_odm_drop_grid = df_odm_drop_grid.drop(columns=ride_attribute_columns)
# Pair each ride's pickup cell with its dropoff cell by joining on the ride
# index; the overlapping column names receive the _x (pickup) and
# _y (dropoff) suffixes.
df_omd_grid = pd.merge(df_odm_pick_grid, df_odm_drop_grid, left_index=True, right_index=True)
df_omd_grid = df_omd_grid.drop(columns=ride_attribute_columns + ['geometry_x', 'geometry_y'])
# Every row counts as one ride.
df_omd_grid['value'] = 1
df_omd_grid.head()
df_omd_grid.shape
# Origin-destination matrix: rows are pickup cells, columns are dropoff
# cells, values are ride counts (0 where no ride connects the two cells).
odm_matrix = df_omd_grid.pivot_table(
    index='index_right_x',
    columns='index_right_y',
    values='value',
    aggfunc=np.sum,
).fillna(0)
print(odm_matrix.shape)